The aim of this project is to investigate the association between AF and lifestyle risk factor and develop a lifestyle risk score to predict AF.
library(data.table)
library(ggplot2)
library(tidyverse)
library(ggthemes)
library(cowplot)
library(fastDummies)
library(flextable)
library(officer)
library(corrplot)
library(glmnet)
library(pROC)
library(rms) #for nomogram
library(knitr) #for include_graphics
library(umap) #umap plots
library(cutpointr) #ROC cutpoint analysis
library(MLmetrics) #to do logloss/f1 scores over glm model
library(DescTools) #to compute Brier score
library(riskRegression) # to do calibrarion plotsWe randomly divided the filtered data into a testing (70%) and validation set (30%).
validating_score <- function(dataset, model, label) {
x <- computeScores(dataset, model)
lr <- glm("AF_Status ~ scale(score)", family = binomial, data = x)
lr_or <- get_ors(lr)
#AUC
#AUC(y_pred = lr$fitted.values, y_true = x$AF_Status) #using MLmetrics
pred <- predict(lr, x)
ro <- pROC::roc(x$AF_Status, pred)
au <- as.numeric(pROC::auc(ro))
ci <- as.numeric(pROC::ci(ro))
auc_l <- format(round(ci[1], 3), nsmall = 3)
auc_h <- format(round(ci[3], 3), nsmall = 3)
auc <- format(round(au, 3), nsmall = 3)
#Logloss
ll <- LogLoss(y_pred = lr$fitted.values, y_true=x$AF_Status)
ll <- format(round(ll, 3), nsmall = 3)
#Brier score
br <- BrierScore(lr)
br <- format(round(br, 3), nsmall = 3)
auc_info <- c(label, paste0(auc, " (", auc_l, ", ", auc_h, ")"), ll, br)
return(list(auc_info, ro))
}
validating_manual_score <- function(dataset) {
#create columns for MODIFY models
dataset$age_65_74 <- ifelse(dataset$age >= 65 & dataset$age < 75, 1, 0)
dataset$age_75 <- ifelse(dataset$age > 74, 1, 0)
dataset$age_60_74 <- ifelse(dataset$age >= 60 & dataset$age < 75, 1, 0)
dataset$age_60_64 <- ifelse(dataset$age >= 60 & dataset$age < 64, 1, 0)
dataset$age_65 <- ifelse(dataset$age > 64, 1, 0)
dataset$Alcohol_6_15 <- ifelse(dataset$Alcohol_sdd > 5 & dataset$Alcohol_sdd <= 15, 1, 0)
dataset$Alcohol_16_50 <- ifelse(dataset$Alcohol_sdd > 15, 1, 0)
model_au_or <- list()
for(i in 1:length(MODIFYAF)) {
x <- computeScores(dataset, MODIFYAF[[i]])
lr <- glm("AF_Status ~ scale(score)", family = binomial, data = x)
lr_or <- get_ors(lr)
#AUC
pred <- predict(lr, x)
ro <- pROC::roc(x$AF_Status, pred)
au <- as.numeric(pROC::auc(ro))
ci <- as.numeric(pROC::ci(ro))
auc_l <- format(round(ci[1], 3), nsmall = 3)
auc_h <- format(round(ci[3], 3), nsmall = 3)
auc <- format(round(au, 3), nsmall = 3)
#Logloss
ll <- LogLoss(y_pred = lr$fitted.values, y_true=x$AF_Status)
ll <- format(round(ll, 3), nsmall = 3)
#Brier score
br <- BrierScore(lr)
br <- format(round(br, 3), nsmall = 3)
auc_info <- c(paste0("MODIFYAF ", i),
paste0(auc, " (", auc_l, ", ", auc_h, ")"), ll, br)
model_au_or[[i]] <- list(auc_info, ro)
}
return(model_au_or)
}
plot_multiroc <- function(l_rocs, l_labels) {
roc_data <- data.frame(sensitiviy = character(),
specificity = character(),
Model = character(),
stringsAsFactors=FALSE)
for(i in 1:length(l_rocs)) {
x <- data.frame(sensitiviy = l_rocs[[i]]$sensitivities,
specificity = l_rocs[[i]]$specificities, stringsAsFactors=FALSE)
#Get AUC
au <- as.numeric(pROC::auc(l_rocs[[i]])) * 100
au <- format(round(au, 1), nsmall = 1)
x$Model = paste0(l_labels[[i]], " (AUC: ", au, "%)")
roc_data <- rbind(roc_data, x)
}
pl <- ggplot(data=roc_data, aes(x=1-specificity, y=sensitiviy, color=Model)) +
geom_line(size=2) + theme_bw() +
scale_x_continuous(name="1 - Specificity") +
scale_y_continuous(name="Sensitivity") +
theme(legend.position = c(0.8, 0.2),
legend.margin = margin(0.2, 0.2, 0.2, 0.2, "cm"),
legend.text = element_text(size=12),
legend.title = element_text(size=14),
legend.background = element_blank(),
legend.box.background = element_rect(colour = "black"))
return(pl)
}
mod_hist_plot <- function(dataset) {
pdata <- dataset %>%
group_by(score) %>% tally() %>%
mutate(per=round(prop.table(n) * 100, 2))
af_data <- dataset[dataset$AF_Status==1, ] %>%
group_by(score) %>% tally() %>%
mutate(per=round(prop.table(n) * 100, 2))
naf_data <- dataset[dataset$AF_Status==0, ] %>%
group_by(score) %>% tally() %>%
mutate(per=round(prop.table(n) * 100, 2))
d <- list(pdata, af_data, naf_data)
color <- c("steelblue", "#e34a33", "#a1d99b")
ys <- c("Individuals (%)", "AF Individuals (%)", "No AF Individuals (%)")
pl <- list()
for(i in 1:length(d)) {
p <- ggplot(d[[i]], aes(x=score, y=per)) +
geom_bar(stat="identity", fill=color[i]) +
labs(x="MODIFY_AF risk score", y= ys[i]) +
theme_minimal()
pl[[i]] <- p
}
pl <- plot_grid(pl[[1]], pl[[2]], pl[[3]], ncol=1)
return(pl)
}
mod_table_ind <- function(dataset) {
pdata <- dataset %>%
group_by(score) %>% tally()
af_data <- dataset[dataset$AF_Status==1, ] %>%
group_by(score) %>% tally()
colnames(af_data) <- c('score','AF')
pdata <- merge(pdata, af_data, by='score')
naf_data <- dataset[dataset$AF_Status==0, ] %>%
group_by(score) %>% tally()
colnames(naf_data) <- c('score','NoAF')
pdata <- merge(pdata, naf_data, by='score')
tab_pr <- flextable(pdata) %>%
width(j=1, width = 3) %>% #width of each column
width(j=c(2:4), width = 2)
tab_pr <- tab_pr %>% set_header_labels(Score="Score", n="Total Individuals",
AF="AF cases", NoAF="Controls")
# alignment
tab_pr <- tab_pr %>% flextable::align(align = "center", j = c(2:4), part = "all")
# font style
tab_pr <- tab_pr %>%
style(pr_t=fp_text(font.family= "roboto")) %>%
fontsize(i = 1, size = 12, part = "header") %>% # adjust font size of header
bold(i = 1, bold = TRUE, part = "header") %>% # adjust bold face of header
fontsize(part="body", size = 11) %>%
bold(j = 1, bold = TRUE, part = "body")
return(tab_pr)
}mytab <- NULL
{
t <- generate_table(test_ss)
mytab <- demo_table(t)
mytab <- mytab %>% bold(i = c(1:12,17,21,25), j = 1, bold = TRUE, part = "body")
}
mytabAF | No AF | Total | |
Individuals | 11258 | 200790 | 212048 |
Female, n(%) | 3670 (32.6) | 106510 (53.0) | 110180 (52.0) |
Death, n(%) | 2186 (19.4) | 10026 (5.0) | 12212 (5.8) |
Hypertension | 7446 (66.1) | 47752 (23.8) | 55198 (26.0) |
Diabetes | 1051 (9.3) | 8339 (4.2) | 9390 (4.4) |
Sleep Apnoea | 605 (5.4) | 2930 (1.5) | 3535 (1.7) |
Physical Inactivity, n(%) | 2162 (19.2) | 36023 (17.9) | 38185 (18.0) |
Time to AF diag. (median (IQR)) | 7.2 (4.2-9.8) | -- | -- |
BMI, in kg/m2 (median (IQR)) | 27.9 (25.2-31.2) | 26.4 (23.9-29.3) | 26.4 (24.0-29.4) |
Alcohol, in sddw (median (IQR)) | 12.0 (5.0-23.0) | 10.5 (4.5-20.0) | 10.5 (4.5-20.0) |
Age (median (IQR)) | 63.0 (59.0-67.0) | 57.0 (49.0-62.0) | 57.0 (50.0-63.0) |
Age group, n(%) |
|
|
|
older male (≥65) | 3138 (27.9) | 16822 (8.4) | 19960 (9.4) |
older female (≥ 65) | 1493 (13.3) | 16143 (8.0) | 17636 (8.3) |
younger male (<65) | 4450 (39.5) | 77458 (38.6) | 81908 (38.6) |
younger female (<65) | 2177 (19.3) | 90367 (45.0) | 92544 (43.6) |
Obesity, n(%) |
|
|
|
Low | 4707 (41.8) | 112971 (56.3) | 117678 (55.5) |
Medium | 2920 (25.9) | 46134 (23.0) | 49054 (23.1) |
High | 3631 (32.3) | 41685 (20.8) | 45316 (21.4) |
Smoking, n(%) |
|
|
|
Never | 5057 (44.9) | 111957 (55.8) | 117014 (55.2) |
Previous | 5128 (45.5) | 70046 (34.9) | 75174 (35.5) |
Current | 1073 (9.5) | 18787 (9.4) | 19860 (9.4) |
Ethnic Background |
|
|
|
Caucasian | 10000 (88.8) | 169711 (84.5) | 179711 (84.8) |
Other | 1258 (11.2) | 31079 (15.5) | 32337 (15.2) |
mytab <- NULL
{
t <- generate_table(validation_ss)
mytab <- demo_table(t)
mytab <- mytab %>% bold(i = c(1:12,17,21,25), j = 1, bold = TRUE, part = "body")
}
mytabAF | No AF | Total | |
Individuals | 4771 | 86107 | 90878 |
Female, n(%) | 1536 (32.2) | 45533 (52.9) | 47069 (51.8) |
Death, n(%) | 946 (19.8) | 4248 (4.9) | 5194 (5.7) |
Hypertension | 3159 (66.2) | 20399 (23.7) | 23558 (25.9) |
Diabetes | 470 (9.9) | 3486 (4.0) | 3956 (4.4) |
Sleep Apnoea | 269 (5.6) | 1281 (1.5) | 1550 (1.7) |
Physical Inactivity, n(%) | 914 (19.2) | 15316 (17.8) | 16230 (17.9) |
Time to AF diag. (median (IQR)) | 7.3 (4.4-9.8) | -- | -- |
BMI, in kg/m2 (median (IQR)) | 27.9 (25.1-31.2) | 26.4 (23.9-29.3) | 26.5 (24.0-29.4) |
Alcohol, in sddw (median (IQR)) | 12.8 (5.5-23.5) | 10.5 (4.5-20.0) | 10.8 (4.5-20.0) |
Age (median (IQR)) | 63.0 (60.0-67.0) | 57.0 (49.0-63.0) | 57.0 (50.0-63.0) |
Age group, n(%) |
|
|
|
older male (≥65) | 1341 (28.1) | 7265 (8.4) | 8606 (9.5) |
older female (≥ 65) | 655 (13.7) | 6881 (8.0) | 7536 (8.3) |
younger male (<65) | 1894 (39.7) | 33309 (38.7) | 35203 (38.7) |
younger female (<65) | 881 (18.5) | 38652 (44.9) | 39533 (43.5) |
Obesity, n(%) |
|
|
|
Low | 2002 (42.0) | 48271 (56.1) | 50273 (55.3) |
Medium | 1213 (25.4) | 19848 (23.1) | 21061 (23.2) |
High | 1556 (32.6) | 17988 (20.9) | 19544 (21.5) |
Smoking, n(%) |
|
|
|
Never | 2030 (42.5) | 47767 (55.5) | 49797 (54.8) |
Previous | 2253 (47.2) | 30367 (35.3) | 32620 (35.9) |
Current | 488 (10.2) | 7973 (9.3) | 8461 (9.3) |
Ethnic Background |
|
|
|
Caucasian | 4193 (87.9) | 72896 (84.7) | 77089 (84.8) |
Other | 578 (12.1) | 13211 (15.3) | 13789 (15.2) |
time_to_af_plot(test_ss)time_to_af_plot(validation_ss)In this work we considered the following variables:
From this point we will perform all the analysis over the Test cohort and will only use the validation cohort at the end to validate the model generated.
test_ss$AF <- as.factor(test_ss$AF_Status)
summary_continuous(indata=test_ss, field="bmi",type="num",xlab="BMI (Kg/m2)",
leg_pos=c(0.9,0.7), fillvar = "AF", fillname = "AF Status",
rel_heights = c(5.5,4.5,3,2.5), boxcol="white") #xlim=c(min_t, max_t)cp <- cutpointr(data=test_ss, x=bmi, class=AF,
method=maximize_metric, metric=youden)
r <- plot_roc(cp)
y <- plot_metric(cp)
y <- y +
geom_vline(aes(xintercept = cp$optimal_cutpoint, color="red")) +
geom_text(aes(cp$optimal_cutpoint, 0, label = cp$optimal_cutpoint, hjust=-1, color="red")) +
theme(legend.position="none")
plot_grid(r,y)summary_continuous(indata=test_ss, field="age_at_recruitment_f21022_0_0",
type="num", xlab="Age (years)",
leg_pos=c(0.9,0.7), fillvar = "AF", fillname = "AF Status",
rel_heights = c(5.5,4.5,3,2.5), boxcol="white") #xlim=c(min_t, max_t)cp <- cutpointr(data=test_ss, x=age, class=AF,
method=maximize_metric, metric=youden)
r <- plot_roc(cp)
y <- plot_metric(cp)
y <- y +
geom_vline(aes(xintercept = cp$optimal_cutpoint, color="red")) +
geom_text(aes(cp$optimal_cutpoint, 0, label = cp$optimal_cutpoint, hjust=-1, color="red")) +
theme(legend.position="none")
plot_grid(r,y)# Correlation plot
M <- cor(test_ss[,c('Sex', 'age', 'Diabetes', 'Sleep_apnoea','Hypertension', 'AF_Status',
'Smoking_Never','Smoking_Previous', 'Smoking_Current',
'Alcohol_sdd',
'Obesity_Low', 'Obesity_Medium', 'Obesity_High',
'Physical_inactivity')])
order.M <- corrMatOrder(M, order = "AOE")
M <- M[order.M,order.M]
pl <- corrplot(M,
col=rev(colorRampPalette(pal_col)(10)),
method="circle",
type = "upper",
tl.pos = "td",
tl.cex = 0.5,
diag = FALSE,
tl.col="black")plUsing the lifestyle parameters (sex, age, obesity, smoking, alcoho, diabtes, sleep apnoe, and physical inactivity) and UMAP (Uniform Manifold Approximation) we explored how the samples clustered depending on their age group, AF status, and sex.
We tested the predictive power of each of the lifestyle variables independently with respect the AF Control/Cases in the test cohort. The results are based on logistic regression on the test sets without any adjusting and effect sizes are per standard deviation.
plot_or_data(or_data[[2]])tab_orOR (CI 95%) | p-value | |
Sex | 2.336 (2.244, 2.432) | 0.000 |
Age | 1.126 (1.122, 1.130) | 0.000 |
Obesity | ||
ObesityMedium | 1.519 (1.449, 1.593) | 0.000 |
ObesityHigh | 2.091 (1.999, 2.186) | 0.000 |
Smoking | ||
SmokingPrevious | 1.621 (1.557, 1.687) | 0.000 |
SmokingCurrent | 1.264 (1.181, 1.352) | 0.000 |
Alcohol sdd | 1.012 (1.011, 1.014) | 0.000 |
Hypertension | 6.260 (6.013, 6.518) | 0.000 |
Diabetes | 2.376 (2.221, 2.540) | 0.000 |
Sleep Apnoea | 3.835 (3.503, 4.192) | 0.000 |
Physical Inactivity | 1.087 (1.036, 1.141) | 0.001 |
A work by Baker Bioinformatics